import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings
import plotly.io as pio
import plotly.express as px
import scipy.cluster.hierarchy as hierarchy
from collections import namedtuple
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
sys.path.append("../")
from scripts.order_operations import get_min_max_dt, get_order_details # Shows error but its ok, trust me
from scripts.optimizers_mp import k_means_optimizer # Same
from scripts.optimizers import dbscan_optimizer # Same Same
# --- Notebook-wide configuration -------------------------------------------
data_path = "../data"
pio.renderers.default = "notebook_connected"
load_dotenv()
# set_palette() actually applies the palette globally; the original called
# color_palette(), which only *returns* it without installing it.
sns.set_palette('colorblind')
plt.style.use('Solarize_Light2')
# Default figure DPI: read from the DPI env var (populated by dotenv) and fall
# back to 100 when the variable is missing or not a valid integer.
try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError: DPI is unset (os.getenv returned None);
    # ValueError: DPI is set but not parseable as an int.
    # The original also had an `if pc_dpi is None` check afterwards, but that
    # was unreachable: pc_dpi is always an int at this point.
    pc_dpi = 100
This notebook focuses on the widespread RFM (Recency, Frequency, Monetary) approach. Classifications using traditional (and fundamentally flawed) approaches, such as the Pareto distribution, will likely also be attempted.
Using the RFM approach, we will first attempt to distinguish classes with the data AS IS - This is expected to produce poor results. We will then evolve towards machine learning classification algorithms which use more resources but produce, in general, better results.
It is expected that the data provided by this approach will be insufficient but will provide a good raw material to improve the models by improving the data quality and the algorithms.
The usage of RFM presupposes that we have the data to calculate the 3 indicators (R F & M). Not all datasets will be required for this step and there is no need to clog up the memory as the calculation time/resource is expected to go up.
Will be required :
The Recency will be defined by the time separating the most recent update from a Cx and the most recent update known - In this approach, we assume that the latest order placed is the most recent order (instead of using t0 = today).
The Frequency will be : number_of_orders/membership_time. membership_time is : time elapsed between first order of account and last general order(not last order of account).
Monetary will be the total of all item prices of all orders placed by a customer, Kaggle shows that freight is always paid (case where Cx orders 10 times the same item from the same seller --> Cx will pay 10 times the freight price).
# Optimized CSV extracts produced earlier in the project
olist_customers_file = "../data/optimized/olist_customers.csv"
olist_orders_file = "../data/optimized/olist_orders.csv"
olist_order_items_file = "../data/optimized/olist_order_items.csv"
df_customers = pd.read_csv(olist_customers_file)
df_orders = pd.read_csv(olist_orders_file)
df_orders_items = pd.read_csv(olist_order_items_file)
# Dtypes were not carried over and will need to be enforced
# Or using pickles rather than CSV
for id_col in ("customer_id", "customer_unique_id"):
    df_customers[id_col] = df_customers[id_col].astype(np.uint32)
df_customers.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99441 entries, 0 to 99440 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 99441 non-null uint32 1 customer_unique_id 99441 non-null uint32 2 customer_zip_code_prefix 99441 non-null int64 3 customer_city 99441 non-null object 4 customer_state 99441 non-null object dtypes: int64(1), object(2), uint32(2) memory usage: 3.0+ MB
df_customers.head()  # visual sanity check of the customer table
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 1 | 1 | 14409 | franca | SP |
| 1 | 2 | 2 | 9790 | sao bernardo do campo | SP |
| 2 | 3 | 3 | 1151 | sao paulo | SP |
| 3 | 4 | 4 | 8775 | mogi das cruzes | SP |
| 4 | 5 | 5 | 13056 | campinas | SP |
# Parse every timestamp column back to datetime64 (types were lost in the CSV
# round-trip) and restore the compact unsigned-int ids.
date_cols = [
    "order_purchase_dt", "order_approved_at",
    "order_delivered_carrier_date", "order_delivered_customer_date",
    "order_estimated_delivery_date"
]
df_orders[date_cols] = df_orders[date_cols].apply(pd.to_datetime)
for id_col in ("order_id", "customer_id"):
    df_orders[id_col] = df_orders[id_col].astype(np.uint32)
df_orders.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99441 entries, 0 to 99440 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 99441 non-null uint32 1 customer_id 99441 non-null uint32 2 order_status 99441 non-null object 3 order_purchase_dt 99441 non-null datetime64[ns] 4 order_approved_at 99281 non-null datetime64[ns] 5 order_delivered_carrier_date 97658 non-null datetime64[ns] 6 order_delivered_customer_date 96476 non-null datetime64[ns] 7 order_estimated_delivery_date 99441 non-null datetime64[ns] dtypes: datetime64[ns](5), object(1), uint32(2) memory usage: 5.3+ MB
df_orders.head()  # visual sanity check after the dtype fixes
| order_id | customer_id | order_status | order_purchase_dt | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 70297 | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 2 | 77028 | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 3 | 555 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 4 | 61082 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | 5 | 67264 | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
# Restore compact integer ids and parse the shipping deadline timestamp
for id_col in ("order_id", "order_item_id", "product_id", "seller_id"):
    df_orders_items[id_col] = df_orders_items[id_col].astype(np.uint32)
df_orders_items["shipping_limit_date"] = pd.to_datetime(df_orders_items["shipping_limit_date"])
df_orders_items.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112650 entries, 0 to 112649 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112650 non-null uint32 1 order_item_id 112650 non-null uint32 2 product_id 112650 non-null uint32 3 seller_id 112650 non-null uint32 4 shipping_limit_date 112650 non-null datetime64[ns] 5 price 112650 non-null float64 6 freight_value 112650 non-null float64 dtypes: datetime64[ns](1), float64(2), uint32(4) memory usage: 4.3 MB
df_orders_items.head()  # visual sanity check after the dtype fixes
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 85268 | 1 | 25866 | 514 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
| 1 | 71854 | 1 | 27231 | 472 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
| 2 | 6299 | 1 | 22625 | 1825 | 2018-01-18 14:48:30 | 199.00 | 17.87 |
| 3 | 22551 | 1 | 15404 | 2024 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
| 4 | 5248 | 1 | 8863 | 1598 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
# Starting point can be orders df, we will remove infos as we need
df_orders.head()  # re-checking the columns before trimming
| order_id | customer_id | order_status | order_purchase_dt | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 70297 | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 2 | 77028 | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 3 | 555 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 4 | 61082 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | 5 | 67264 | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
# Cx id, order_id, order_purchase_dt are useful, the rest can go
# Keep only the key columns and pre-allocate the two columns filled below
# (customer_uid stays uint32; sum_total starts as NaN until computed).
df_orders = df_orders[["order_id", "customer_id", "order_purchase_dt"]]
df_orders = df_orders.assign(customer_uid=np.uint32(0), sum_total=np.nan)
df_orders.head()
| order_id | customer_id | order_purchase_dt | customer_uid | sum_total | |
|---|---|---|---|---|---|
| 0 | 1 | 70297 | 2017-10-02 10:56:33 | 0 | NaN |
| 1 | 2 | 77028 | 2018-07-24 20:41:37 | 0 | NaN |
| 2 | 3 | 555 | 2018-08-08 08:38:49 | 0 | NaN |
| 3 | 4 | 61082 | 2017-11-18 19:28:06 | 0 | NaN |
| 4 | 5 | 67264 | 2018-02-13 21:18:39 | 0 | NaN |
# Total value of each order = sum of item prices + sum of freight over all of
# the order's line items. A single groupby + map replaces the original
# O(orders x items) per-order filtering loop (two full scans per order).
order_totals = (
    df_orders_items.groupby("order_id")[["price", "freight_value"]]
    .sum()
    .sum(axis=1)
)
# fillna(0.0) keeps the original behavior for orders without line items:
# an empty selection's .sum() was 0.0, not NaN.
df_orders["sum_total"] = df_orders["order_id"].map(order_totals).fillna(0.0)
# Setting customers unique ids instead of cx_ids
# Map each order's customer_id to the account-level customer_unique_id.
# Vectorized replacement for the original O(orders x customers) loop, which
# also assigned a one-element Series (not a scalar) through .at.
cx_uid_map = df_customers.set_index("customer_id")["customer_unique_id"]
df_orders["customer_uid"] = (
    df_orders["customer_id"].map(cx_uid_map).astype(np.uint32)
)
df_orders.head()
| order_id | customer_id | order_purchase_dt | customer_uid | sum_total | |
|---|---|---|---|---|---|
| 0 | 1 | 70297 | 2017-10-02 10:56:33 | 68585 | 38.71 |
| 1 | 2 | 77028 | 2018-07-24 20:41:37 | 74977 | 141.46 |
| 2 | 3 | 555 | 2018-08-08 08:38:49 | 555 | 179.12 |
| 3 | 4 | 61082 | 2017-11-18 19:28:06 | 59790 | 72.20 |
| 4 | 5 | 67264 | 2018-02-13 21:18:39 | 65715 | 28.62 |
# Skeleton of the RFM frame: one row per customer account, sorted by uid.
rfm_cols = ["customer_uid", "order_id_list", "most_ancient_order_dt", "most_recent_order_dt", "recency", "frequency", "monetary"]
df_rfm = pd.DataFrame(columns=rfm_cols)
uniques = df_orders["customer_uid"].unique()
uniques.sort()
df_rfm["customer_uid"] = uniques
# Fill each customer's order list and lifetime spend via the project helper.
for uid in uniques:
    row_idx = df_rfm.index[df_rfm["customer_uid"] == uid][0]
    details = get_order_details(cx_uid=uid, uid_col="customer_uid", from_frame=df_orders)
    df_rfm.at[row_idx, "order_id_list"] = details["order_list"]
    df_rfm.at[row_idx, "monetary"] = details["total_spent"]
# Resolve first/last purchase timestamps for every customer from their order
# list. ('row' replaces the original loop variable, which shadowed the
# builtin `tuple`.)
for row in df_rfm.itertuples():
    min_max_dt = get_min_max_dt(
        order_list=row.order_id_list, from_frame=df_orders, dt_col="order_purchase_dt"
    )
    df_rfm.at[row.Index, "most_ancient_order_dt"] = min_max_dt["min"]
    df_rfm.at[row.Index, "most_recent_order_dt"] = min_max_dt["max"]
# Global reference point (t0): the most recent order in the whole dataset.
most_recent_global = df_rfm["most_recent_order_dt"].max()
def get_recency(row, most_recent_global) -> float:
    """
    Return the recency in seconds: time elapsed between the customer's most
    recent purchase and the most recent purchase in the whole dataset.

    (Annotation fixed: the original declared pd.Timedelta, but
    .total_seconds() returns a float.)
    """
    return (most_recent_global - row["most_recent_order_dt"]).total_seconds()
SECONDS_PER_MONTH = 28 * 24 * 3600  # 2419200 s: one "active month" of 4 weeks

def get_frequency(row, most_recent_global) -> float:
    """
    Return the customer's avg. purchases made per active month.

    Active time runs from the customer's first order to the most recent order
    in the whole dataset. A customer whose first order coincides with the
    global most recent order would divide by zero, hence the 0 fallback.
    """
    account_timespan = most_recent_global - row["most_ancient_order_dt"]
    try:
        return len(row["order_id_list"]) / (account_timespan.total_seconds() / SECONDS_PER_MONTH)
    except ZeroDivisionError:
        return 0
def get_num_order(row):
    """Return how many orders the customer placed (length of the id list)."""
    order_ids = row["order_id_list"]
    return len(order_ids)
# Derive the three RFM columns plus a raw order count for every customer.
df_rfm["recency"] = df_rfm.apply(lambda row: get_recency(row, most_recent_global), axis=1)
df_rfm["frequency"] = df_rfm.apply(lambda row: get_frequency(row, most_recent_global), axis=1)
df_rfm["num_orders"] = df_rfm.apply(get_num_order, axis=1)
df_rfm.head()
| customer_uid | order_id_list | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | [88493] | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 |
| 1 | 2 | [90419] | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 |
| 2 | 3 | [22558] | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 |
| 3 | 4 | [32181] | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.3 | 1 |
| 4 | 5 | [69903] | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 |
Goals :
df_rfm.sort_values("num_orders", ascending=False)  # most active customers first
| customer_uid | order_id_list | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | |
|---|---|---|---|---|---|---|---|---|
| 14104 | 14105 | [2341, 3661, 5168, 11695, 16232, 19128, 24887,... | 2017-05-15 23:30:03 | 2018-08-20 19:14:26 | 5004952.0 | 0.915825 | 902.04 | 17 |
| 5105 | 5106 | [3006, 14074, 28926, 41710, 46195, 53989, 8032... | 2017-09-18 18:53:15 | 2018-02-27 18:36:39 | 20040819.0 | 0.639687 | 1172.67 | 9 |
| 6995 | 6996 | [8031, 22631, 24841, 53421, 65531, 75391, 83571] | 2017-11-13 16:44:41 | 2018-02-14 13:22:12 | 21182886.0 | 0.579827 | 959.01 | 7 |
| 26099 | 26100 | [5298, 10136, 25332, 27522, 59635, 81816, 86325] | 2017-10-09 12:34:39 | 2018-06-01 11:38:29 | 11944309.0 | 0.525180 | 1122.72 | 7 |
| 16760 | 16761 | [16444, 27469, 31847, 57970, 58380, 61165, 88757] | 2017-09-19 01:02:44 | 2018-06-28 00:43:34 | 9650804.0 | 0.497859 | 758.83 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32980 | 32981 | [86856] | 2018-06-01 11:20:52 | 2018-06-01 11:20:52 | 11945366.0 | 0.202522 | 133.89 | 1 |
| 32979 | 32980 | [92350] | 2017-12-02 12:07:58 | 2017-12-02 12:07:58 | 27580940.0 | 0.087713 | 112.83 | 1 |
| 32978 | 32979 | [32665] | 2017-11-12 20:29:28 | 2017-11-12 20:29:28 | 29278850.0 | 0.082626 | 47.74 | 1 |
| 32977 | 32978 | [65645] | 2017-03-03 01:23:13 | 2017-03-03 01:23:13 | 51293225.0 | 0.047164 | 142.57 | 1 |
| 96095 | 96096 | [96138] | 2017-12-19 14:27:23 | 2017-12-19 14:27:23 | 26103775.0 | 0.092676 | 21.77 | 1 |
96096 rows × 8 columns
# Frequency table: how many customers placed exactly N orders
number_orders = dict(df_rfm["num_orders"].value_counts())
number_orders
{1: 93099, 2: 2745, 3: 203, 4: 30, 5: 8, 6: 6, 7: 3, 9: 1, 17: 1}
# Bar chart: customers per order count
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)
ax1.bar(x=list(number_orders.keys()), height=list(number_orders.values()), width=1, color="navy", edgecolor="black")
###
# Titles/Lables
ax1.set_xlabel("Number of orders")
# +1 so the largest order count gets a tick too (range() end is exclusive;
# the original stopped one short of the max).
ax1.set_xticks(range(0, max(number_orders.keys()) + 1))
ax1.set_ylabel("Number of customers")
fig.suptitle("Number of orders distribution in dataset")
#
###
fig.tight_layout()
plt.show()
# Excluding 0 :
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)
# NOTE(review): num_orders has no key 0 in this dataset (minimum is 1), so
# this pop is a no-op. Given the title ("exclusion of unique orders") and the
# xticks starting at 2, popping key 1 may have been the intent — confirm.
try:
    number_orders.pop(0)
except KeyError:
    pass
ax1.bar(x=list(number_orders.keys()), height=list(number_orders.values()), width=1, color="navy", edgecolor="black")
###
# Titles/Lables
ax1.set_xlabel("Number of orders")
ax1.set_ylabel("Number of customers")
# +1 keeps the top order count ticked (range() end is exclusive).
ax1.set_xticks(range(2, max(number_orders.keys()) + 1))
fig.suptitle("Number of orders per cx, exclusion of unique orders")
#
###
fig.tight_layout()
plt.show()
Clearly, Olist either has problems keeping customers loyal, or it is easier to create a new account on each visit. In any case, the number of customers who made exactly one purchase (93099 customers) is crushingly larger than all other customers combined, regardless of frequency. Excluding customers who made exactly one purchase, the majority of remaining customers order on Olist between .1 and .4 times a month (between 1.1 and 4.8 times a year). There are extremes ordering up to 1.7 times a month, but these customers are extremely rare.
# Boxplot of total spend per customer (means shown, outliers as small '+')
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 5), dpi=pc_dpi)
outlier_marker = {"marker": "+", "markersize": 2, "markerfacecolor": "navy"}
ax1.boxplot(x="monetary", data=df_rfm, showmeans=True, vert=False, flierprops=outlier_marker)
###
# Titles/Lables
fig.suptitle("Customer repartition by total spent on Olist")
#
###
fig.tight_layout()
plt.show()
df_rfm["monetary"].value_counts()  # spot-check how common each spend total is
0.00 676
77.57 246
35.00 160
73.34 148
116.94 125
...
28.31 1
203.23 1
139.48 1
75.74 1
92.36 1
Name: monetary, Length: 31734, dtype: int64
Lots of zeroes, either purchased is canceled or refunded. - Lets drop those and zoom on 0 -> 1K
# Same boxplot without the zero-value transactions, zoomed on the 0-1000 range
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 5), dpi=pc_dpi)
marker_conf = {"marker": "+", "markersize": 2, "markerfacecolor": "navy"}
nonzero_spenders = df_rfm[df_rfm["monetary"] != 0.0]
ax1.boxplot(x="monetary", data=nonzero_spenders, showmeans=True, vert=False, flierprops=marker_conf)
###
# Titles/Lables
ax1.set_xlim(0, 1000)
fig.suptitle("Customer repartition by total spent on Olist | Transaction = 0 removed & Zoom on 0-1K")
#
###
fig.tight_layout()
plt.show()
# For some reason .describe() does not work here
# NOTE(review): likely because "monetary" was filled cell-by-cell via .at on a
# freshly-created object-dtype column, so it never became numeric — confirm
# with df_rfm.dtypes.
print("Q1 :", df_rfm["monetary"].quantile(.25))
print("Q3 :", df_rfm["monetary"].quantile(.75))
print("Median :", df_rfm["monetary"].median())
print("Avg. :", np.average(df_rfm["monetary"].values))
Q1 : 62.39 Q3 : 182.2375 Median : 107.27000000000001 Avg. : 164.87214077588146
# Outliers (per the analysis below) are the customers spending above Q3.
# The original also computed the "< Q1" subset but immediately overwrote it
# with this one, so that dead assignment is removed.
outliers = df_rfm[df_rfm["monetary"] > 182.2375]
print("Max :", max(df_rfm["monetary"].values))
print(len(outliers))
del outliers  # free the temporary subset right away
Max : 13664.08 24024
Most clients spend between 62.39 & 182.24 Reales (assuming the currency is Reales and not Dollars; it is not specified in the dataset, but website screenshots show that Reales is used), with the median at 107.28 and an average of 164.88 R$.
There is a wide range of outliers (24024) with spendings going from +Q3(182.2375 R$) to 13664.08 R$, but, mostly, outliers spent 4K max with a very small number of customers crossing this line.
df_rfm.describe()["recency"]  # distribution stats for recency (in seconds)
count 9.609600e+04 mean 2.490060e+07 std 1.325517e+07 min 0.000000e+00 25% 1.415973e+07 50% 2.323386e+07 75% 3.433116e+07 max 6.677370e+07 Name: recency, dtype: float64
# Recency distribution, raw seconds
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 6), dpi=pc_dpi)
ax1.boxplot(x="recency", data=df_rfm, vert=False, showmeans=True)
###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time:")
ax1.set_xlabel("Time Delta (seconds)")
#
###
fig.tight_layout()
plt.show()
# Not very visual so let's convert seconds to days :
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 6), dpi=pc_dpi)
deltas_days = df_rfm["recency"] / 86400  # seconds -> days
ax1.boxplot(x=deltas_days, vert=False, showmeans=True)
###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time :")
ax1.set_xlabel("Time Delta (days)")
#
###
fig.tight_layout()
plt.show()
deltas_days.describe()
count 96096.000000 mean 288.201370 std 153.416318 min 0.000000 25% 163.885744 50% 268.910440 75% 397.351398 max 772.843738 Name: recency, dtype: float64
# Pairwise scatter of the three RFM variables, eyeball check for clusters
grid = sns.pairplot(df_rfm[["recency", "frequency", "monetary"]])
# The original did `grid.figure.figsize = (4, 4)`, which just creates an
# unused attribute on the Figure and has no effect; set_size_inches is the
# working equivalent.
grid.figure.set_size_inches(4, 4)
grid.figure.dpi = pc_dpi
###
# Titles/Lables
grid.figure.suptitle("Pairplot between RFM variables")
#
###
grid.figure.tight_layout()
plt.show()
There are no obvious clusters distinguishable using simply the RFM variables and pairplotting. 3D plotting of RFM can be attempted, but it seems the data of the 3 variables will not be sufficient to offer what the customer (Olist) is looking for. The Wikipedia page regarding Olist mentions that there are 2M+ unique active customers, so it looks like we only have a small fragment of their database: we know we have 96K accounts out of 2M+ existing accounts, but we are not aware of other useful statistics, like the number of orders, which would explain why our frequency is ≈ 0 for close to 99% of the customers.
We hope to obtain visual clusters to identify and classify customers using the MK1 eyeball without any clustering algorithms or data engineering
# Interactive 3D view of the raw RFM space
labels_dict = {
    "recency": "Recency (in seconds)",
    "frequency": "Frequency of purchase(s)",  # fixed "Frequncy" typo in the axis label
    "monetary": "Total Spent on Olist"
}
marker_style = {
    "color": 'navy',
    "size": 5,
}
fig = px.scatter_3d(
    data_frame=df_rfm, x="recency",
    y="frequency", z="monetary",
    width=5 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
)
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach",
)
fig.update_traces(marker=marker_style)
fig.show()
Following the inconclusive results (as expected) of part #3, Machine Learning classification algorithms can be applied to determine if clusters can be found.
0 -> 10 or 0 -> 5 will be better suited for understanding data like reviews and ratings
# Rescale every feature onto 0-10 ranges for the clustering algorithms
mms = MinMaxScaler(feature_range=(0, 10))
dropcols = ["order_id_list", "most_ancient_order_dt", "most_recent_order_dt", "num_orders"]
df_rfm_mms = df_rfm.drop(columns=dropcols)
df_rfm_mms.set_index("customer_uid", inplace=True)
keepcols = df_rfm_mms.columns
# Keep the customer_uid index through the numpy round-trip: the original
# rebuilt the frame with a fresh RangeIndex, so later
# df_rfm_mms.at[customer_uid, ...] lookups were shifted by one row and raised
# KeyError for the largest uid (the "one exception" noted near the export).
df_rfm_mms = pd.DataFrame(
    mms.fit_transform(df_rfm_mms.to_numpy()),
    columns=keepcols,
    index=df_rfm_mms.index,
)
df_rfm_mms.head()
| recency | frequency | monetary | |
|---|---|---|---|
| 0 | 6.716759 | 0.316991 | 0.107486 |
| 1 | 3.595325 | 0.592200 | 0.245520 |
| 2 | 1.954565 | 1.089323 | 0.115434 |
| 3 | 2.821503 | 0.754616 | 0.126829 |
| 4 | 1.039261 | 2.048718 | 0.184608 |
Trial and error will be used to determine the best k-range to pass to the optimizer
# Candidate cluster counts for the elbow/silhouette sweep (2 through 8)
k_range = range(2, 9)
k_means_optimizer(data=df_rfm_mms, k_range=k_range)
4 and 5 both seem to be the optimal k-numbers of clusters, the error keeps diminishing with the increasing number of clusters but the silhouette score is acceptable, performing k-means with k=4 (k=5 doesn't look as good) and plotting the results
It seems the silhouette score calculation is what makes the optimizer take its sweet time; if not needed for comparison, it can be considered to rely solely on inertia.
# Fixed seed + explicit n_init make the clustering reproducible across runs;
# cluster numbers are still arbitrary, but stable, which matters because the
# cluster -> group mapping below is entered by hand.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
y_predicted = km.fit_predict(df_rfm_mms)
df_rfm_mms["cluster_4"] = y_predicted
# 3D view of the four k-means clusters in RFM space
axis_labels = {
    "recency": "Recency",
    "frequency": "Frequency of purchase(s)",
    "monetary": "Total Spent on Olist"
}
dot_style = {"size": 5}
fig = px.scatter_3d(
    data_frame=df_rfm_mms,
    x="recency", y="frequency", z="monetary",
    color="cluster_4",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=axis_labels,
)
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, k-means clustering, k=4",
)
fig.update_traces(marker=dot_style)
fig.show()
Even if the clustering is not crystal clear, we can see a somewhat clear pattern, which is to be expected:
Even if it is too early to tell, we can speculate and maybe try to apply the "Law of Diffusion of Innovation", Everett Rogers' 1968 theory and reapplied to business models by the now renowned business writer and public speaker Simon Sinek.
This theory states that :
We can quantify our clusters and see if we can indeed see what looks like this distribution : group 1 & 2 believing strongly in the idea (cluster 1), group 3 & 4 being the majority (cluster 0 and 2) and group 5 (cluster 3) being the laggards.
# Manually map the k-means cluster numbers (0-3) onto Law-of-Diffusion roles
# by reading the 3D plot above; cluster labels are arbitrary, so this has to
# be re-entered on every run.
inv = int(input("Investors : "))
maj1 = int(input("Early Majority : "))
maj2 = int(input("Late Majority : "))
lagg = int(input("Laggards : "))
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 6), dpi=pc_dpi)
# Head-counts per cluster, regrouped into the three Law-of-Diffusion classes.
# (amounts_inv / amount_maj_1 / amount_maj_2 / amount_lag are reused further
# down for the stratified sampling, keep the names.)
amounts = df_rfm_mms["cluster_4"].value_counts()
amounts_inv = amounts[inv]
amount_maj_1 = amounts[maj1]
amount_maj_2 = amounts[maj2]
amounts_maj = amount_maj_1 + amount_maj_2
amount_lag = amounts[lagg]
total = len(df_rfm_mms)

def _pct(count):
    """Share of the customer base as a percentage rounded to 2 decimals."""
    return round((count / total) * 100, ndigits=2)

cluster_dict = {
    f"Investors-{_pct(amounts_inv)}%": amounts_inv,
    f"Majority-{_pct(amounts_maj)}%": amounts_maj,
    f"Laggards-{_pct(amount_lag)}%": amount_lag
}
my_colors = ["royalblue", "#003153", "red"]
ax1.bar(x=list(cluster_dict.keys()), height=list(cluster_dict.values()), color=my_colors)
###
# Titles/Lables
ax1.set_xlabel("Law of Diffusion Classes (with respective percentages)")
ax1.set_ylabel("Customer per Classes")
fig.suptitle("Expression of Customers theorizing the Law of Diffusion of Innovation")
#
###
fig.tight_layout()
plt.show()
The above graph seems to confirm the intuition that Olist's customers follow indeed the Law of Diffusion of Innovation (or consumption here). It is very interesting to see that the cuts between the classes are clear and provide a potential course of action for Olist. Indeed, according to the Law of Diffusion of Innovation :
DBSCAN is another algorithm testable on this dataset. We first need to determine the best max intra cluster distance (epsilon) and the best minimal amount of points it takes to make a cluster, this is often dimension + 1 or dimension * 2 Epsilon can be determined using k neighbors. Using a graph to represent the avg distance between a point and its k-neighbors (here 4 : dimension + 1). Zooming in and using the elbow method help us to focus on the best potential epsilon.
# Feature matrix for the k-distance (epsilon) estimation
neighbors_matrix = df_rfm_mms[["recency", "frequency", "monetary"]].to_numpy()
nneighbors = NearestNeighbors(n_neighbors=4, n_jobs=-1) # dataset dim + 1
nneighbors.fit(X=neighbors_matrix)
# kneighbors returns (distances, indices); the indices are not used below
distances, potential_eps = nneighbors.kneighbors(neighbors_matrix)
# Sort each distance column independently so the curve is monotonic
distances = np.sort(distances, axis=0)
# NOTE(review): column 1 is the distance to the nearest non-self neighbor;
# the usual DBSCAN heuristic plots the k-th (last) column — confirm intent.
distances_plot = distances[:,1]
# Full k-distance curve, points sorted ascending
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(16, 8), dpi=pc_dpi)
ax1.plot(distances_plot)
###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###
fig.tight_layout()
plt.show()
# Zooming up until we can see the "elbow"
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(16, 8), dpi=pc_dpi)
ax1.plot(distances_plot)
###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
# Focus on the tail of the curve with a fine y-grid to read epsilon off it
ax1.set_xlim((96020, 96100))
ax1.set_yticks(np.arange(0, 5, 0.1))
ax1.grid(visible=True, axis="both")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###
fig.tight_layout()
plt.show()
# Looks like .8 is the best candidate
best_dbs = DBSCAN(eps=0.8, min_samples=4, n_jobs=-1)
# Cluster on the RFM features only: strip any cluster columns added earlier
feature_frame = df_rfm_mms.drop(
    columns=["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"], errors="ignore"
)
y_predict = best_dbs.fit_predict(feature_frame)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df_rfm_mms.loc[:, "cluster_DBSCAN"] = y_predict
# 3D view of the DBSCAN labels (-1 = noise) in RFM space
axis_labels = {
    "recency": "Recency",
    "frequency": "Frequency of purchase(s)",
    "monetary": "Total Spent on Olist"
}
dot_style = {"size": 5}
fig = px.scatter_3d(
    data_frame=df_rfm_mms,
    x="recency", y="frequency", z="monetary",
    color="cluster_DBSCAN",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=axis_labels,
)
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, DBSCAN clustering",
)
fig.update_traces(marker=dot_style)
fig.show()
DBSCAN clustering is disappointing here. It identifies outliers rather than smaller groups. It doesn't help to narrow down the potential targets for any campaign. It is unclear whether or not my parameters are not correct or if the algorithm is not helpful in this case.
A final clustering attempt can be made using Agglomerative Clustering.
Agglomerative Clustering might help identify pre-determined clusters with the help of the linkage distance, which we can use from Scipy The objective is to produce at least a similar result to K-Means, if not, it is not worth using and updating at this stage.
Quoting sources : If you want to create flat clusters we can analyze the [...] dendrogram to determine no. of clusters. We first assume that the horizontal lines are extended on both sides, and as such, they would also cross the vertical lines. Now we have to identify the tallest vertical line that does not have any horizontal line crossing through it.
## Due to Apple m1 chip being a little stubborn, we need to sample the group or the overpriced piece of aluminium is gonna melt
## Taking representatives from classes from k-means, sample size = 2K
# Stratified ~2K sample: each Law-of-Diffusion class contributes in
# proportion to its share of the full customer base.
sample_size = 2000
population = len(df_rfm_mms)
sample_inv = df_rfm_mms[df_rfm_mms["cluster_4"] == inv].sample(round((amounts_inv / population) * sample_size))
sample_maj1 = df_rfm_mms[df_rfm_mms["cluster_4"] == maj1].sample(round((amount_maj_1 / population) * sample_size))
sample_maj2 = df_rfm_mms[df_rfm_mms["cluster_4"] == maj2].sample(round((amount_maj_2 / population) * sample_size))
sample_lagg = df_rfm_mms[df_rfm_mms["cluster_4"] == lagg].sample(round((amount_lag / population) * sample_size))
df_rfm_mms_2ksample = pd.concat([sample_inv, sample_maj1, sample_maj2, sample_lagg]).sort_index()
len(df_rfm_mms_2ksample)
1999
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(8, 4), dpi=pc_dpi)
# Ward linkage on the RFM features only (cluster columns are stripped first)
linkage_method = hierarchy.linkage(
    df_rfm_mms_2ksample.drop(columns=["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"], errors="ignore"),
    method="ward",
    metric="euclidean",
)
dendrogram_plot = hierarchy.dendrogram(linkage_method, ax=ax1, no_labels=True)
###
# Titles/Lables
#
###
fig.tight_layout()
plt.show()
# Lets say its 3 ?
agg = AgglomerativeClustering(n_clusters=3)
#Let's try the small one first
# Fit on the RFM features only; previous cluster columns are dropped
features_only = df_rfm_mms_2ksample.drop(columns=["cluster_4", "cluster_DBSCAN"], errors="ignore")
df_rfm_mms_2ksample["cluster_agg"] = agg.fit_predict(features_only)
df_rfm_mms_2ksample.head()
| recency | frequency | monetary | cluster_4 | cluster_DBSCAN | cluster_agg | |
|---|---|---|---|---|---|---|
| 4 | 1.039261 | 2.048718 | 0.184608 | 1 | 0 | 0 |
| 101 | 3.807008 | 0.559272 | 0.158532 | 0 | 0 | 1 |
| 195 | 4.129577 | 0.515586 | 0.052839 | 0 | 0 | 1 |
| 202 | 0.999133 | 2.130999 | 0.251565 | 1 | 0 | 0 |
| 235 | 4.001812 | 0.532047 | 0.094364 | 0 | 0 | 1 |
# 3D view of the agglomerative clusters on the 2K stratified sample
axis_labels = {
    "recency": "Recency",
    "frequency": "Frequency of purchase(s)",
    "monetary": "Total Spent on Olist"
}
dot_style = {"size": 5}
fig = px.scatter_3d(
    data_frame=df_rfm_mms_2ksample,
    x="recency", y="frequency", z="monetary",
    color="cluster_agg",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=axis_labels,
)
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, Agglomerative clustering - 2K sample",
)
fig.update_traces(marker=dot_style)
fig.show()
This clustering method could be fine, and sure, it works on a 2K sample but an I7 4.3GhZ w/ 32Gb RAM couldn't hack it in an hour for 100K. So it is not in the interest of Olist to use an algorithm that demanding for their 2M unique customers.
Test has been done on Desktop using :
Not all of this points might be relevant but including them in the model could make a difference.
Will be exported :
# Export :
# Preview of the scaled frame (with its cluster labels) before writing it out
df_rfm_mms.head()
| recency | frequency | monetary | cluster_4 | cluster_DBSCAN | |
|---|---|---|---|---|---|
| 0 | 6.716759 | 0.316991 | 0.107486 | 2 | 0 |
| 1 | 3.595325 | 0.592200 | 0.245520 | 3 | 0 |
| 2 | 1.954565 | 1.089323 | 0.115434 | 3 | 0 |
| 3 | 2.821503 | 0.754616 | 0.126829 | 3 | 0 |
| 4 | 1.039261 | 2.048718 | 0.184608 | 1 | 0 |
df_rfm.head()  # preview of the raw RFM frame before export
| customer_uid | order_id_list | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | [88493] | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 |
| 1 | 2 | [90419] | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 |
| 2 | 3 | [22558] | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 |
| 3 | 4 | [32181] | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.3 | 1 |
| 4 | 5 | [69903] | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 |
# one exception, hence try / except
def get_kmeans(row):
    """Look up the k-means (k=4) cluster label for this customer's uid.

    Returns None (rows dropped later via dropna) when the uid is not present
    in df_rfm_mms's index — the original fell through `pass` to an implicit
    None; this makes that explicit.
    """
    try:
        return int(df_rfm_mms.at[row["customer_uid"], "cluster_4"])
    except KeyError:
        return None
def get_dbscan(row):
    """Look up the DBSCAN cluster label for this customer's uid.

    Returns None (rows dropped later via dropna) when the uid is not present
    in df_rfm_mms's index — explicit instead of the original implicit None.
    """
    try:
        return df_rfm_mms.at[row["customer_uid"], "cluster_DBSCAN"]
    except KeyError:
        return None
# Attach both cluster assignments to the raw RFM frame, drop the rows whose
# uid could not be resolved, then restore integer dtypes (dropna leaves the
# columns as float/object otherwise).
df_rfm["cluster_kmeans_4"] = df_rfm.apply(get_kmeans, axis=1)
df_rfm["cluster_DBSCAN"] = df_rfm.apply(get_dbscan, axis=1)
df_rfm.dropna(subset=["cluster_kmeans_4", "cluster_DBSCAN"], inplace=True)
for cluster_col in ("cluster_kmeans_4", "cluster_DBSCAN"):
    df_rfm[cluster_col] = df_rfm[cluster_col].astype(int)
def get_name(grp_nbr: int) -> str:
    """Map a k-means cluster number to its adopter-category display label.

    Relies on the module-level cluster ids ``inv``, ``maj1``, ``maj2`` and
    ``lagg`` resolved earlier in the notebook.

    Raises:
        ValueError: if ``grp_nbr`` matches none of the known cluster ids.
            (ValueError subclasses Exception, so existing broad handlers
            still catch it.)
    """
    if grp_nbr == inv:
        return "Investors"
    if grp_nbr == maj1:
        return "Early Majority"
    if grp_nbr == maj2:
        return "Late Majority"
    if grp_nbr == lagg:
        return "Laggards"
    raise ValueError(f"group number not defined: {grp_nbr}")
def addlabels_1(x, y):
    """Annotate bar i with its value rounded to 2 decimals, suffixed with R$."""
    for i in range(len(x)):
        amount = round(y[i], ndigits=2)
        plt.text(i, y[i], f"{amount} R$", ha="center", bbox=dict(facecolor="white", alpha=1))
def addlabels(x, y):
    """Annotate bar i with its raw value, suffixed with R$."""
    for i in range(len(x)):
        caption = f"{y[i]} R$"
        plt.text(i, y[i], caption, ha="center", bbox=dict(facecolor="white", alpha=1))
# Total per cluster :
# Per-cluster monetary summary: total spent and rounded average per member,
# keeping the order of first appearance (unique()) so bar colors stay stable.
k_means_clusters = df_rfm["cluster_kmeans_4"].unique().tolist()
Group_money = namedtuple("Group_money", field_names=["group_name", "group_total", "avg_p_capita"])
group_spending = []
for cluster in k_means_clusters:
    # Filter once per cluster; both the total and the average reuse it.
    monetary = df_rfm.loc[df_rfm["cluster_kmeans_4"] == cluster, "monetary"]
    total_spent = monetary.values.sum()
    avg_pc = round(np.average(monetary), ndigits=2)
    group_spending.append(
        Group_money(
            group_name=get_name(cluster),
            group_total=total_spent,
            avg_p_capita=avg_pc,
        )
    )
# Bar chart: total amount spent per k-means cluster.
group_names = [grp.group_name for grp in group_spending]
group_totals = [grp.group_total for grp in group_spending]
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 8),
    dpi=pc_dpi,
)
ax1.bar(
    x=group_names,
    height=group_totals,
    width=.8,
    color=["royalblue", "#003153", "navy", "red"],
    edgecolor="black",
)
###
# Titles / labels
addlabels_1(x=group_names, y=group_totals)
# Values are written on the bars themselves, so hide the y-axis ticks.
ax1.set(yticklabels=[])
ax1.tick_params(left=False)
fig.suptitle("Total spent by clusters determined by k-means w/ k=4")
#
###
fig.tight_layout()
plt.show()
As expected, the majority is the group spending the most — they form the two largest clusters, so this is unsurprising. The Early Majority seems to be spending more than the Late Majority.
Surprisingly, the most loyal customers seem to spend about as much as the laggards; given that the two clusters are approximately the same size, that is unexpected.
Plotting the average spending per member might help clarify these observations.
# Bar chart: average spending per member for each k-means cluster.
group_names = [grp.group_name for grp in group_spending]
group_averages = [grp.avg_p_capita for grp in group_spending]
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 8),
    dpi=pc_dpi,
)
ax1.bar(
    x=group_names,
    height=group_averages,
    width=.8,
    color=["royalblue", "#003153", "navy", "red"],
    edgecolor="black",
)
###
# Titles / labels
addlabels(x=group_names, y=group_averages)
# Values are written on the bars themselves, so hide the y-axis ticks.
ax1.set(yticklabels=[])
ax1.tick_params(left=False)
fig.suptitle("Spending per capita by clusters determined by k-means w/ k=4")
#
###
fig.tight_layout()
plt.show()
OK, so that's surprising. Each group has roughly the same average spending per capita, with a standard deviation of only around 5 R$. The determining factor in the clustering was surely not the monetary dimension, but mostly frequency and recency. We will build a more robust model to identify that phenomenon.
def get_k_cluster_names(row):
    """Translate a row's k-means cluster id into a snake_case segment name.

    Returns None when the id matches none of the known clusters (the
    cluster ids inv/maj1/maj2/lagg are module-level names resolved earlier).
    """
    segment_names = {
        inv: "investors",
        maj1: "early_majority",
        maj2: "late_majority",
        lagg: "laggards",
    }
    return segment_names.get(row["cluster_kmeans_4"])
# Label every customer with its k-means segment name.
df_rfm["k_cluster_name"] = df_rfm.apply(get_k_cluster_names, axis=1)
# Visual check of the final enriched frame.
df_rfm.head()
| customer_uid | order_id_list | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | cluster_kmeans_4 | cluster_DBSCAN | k_cluster_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | [88493] | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 | 3 | 0 | early_majority |
| 1 | 2 | [90419] | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 | 3 | 0 | early_majority |
| 2 | 3 | [22558] | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 | 3 | 0 | early_majority |
| 3 | 4 | [32181] | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.3 | 1 | 1 | 0 | investors |
| 4 | 5 | [69903] | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 | 0 | 0 | late_majority |
# Persist the enriched RFM frame: CSV for portability, pickle to keep the
# pandas dtypes (datetimes, the order-id lists) intact.
df_rfm.to_csv(path_or_buf="../data/optimized/cx_rfm.csv", index=False)
df_rfm.to_pickle(path="../pickles/cx_rfm.pkl")